@InProceedings{MaiaVieiPedr:2021:ViRhCo,
               author = "Maia, Helena de Almeida and Vieira, Marcelo Bernardes and Pedrini, 
                         Helio",
          affiliation = "UNICAMP and UFJF and UNICAMP",
                title = "Visual rhythm-based convolutional neural networks and adaptive 
                         fusion for a multi-stream architecture applied to human action 
                         recognition",
            booktitle = "Proceedings...",
                 year = "2021",
               editor = "Paiva, Afonso and Menotti, David and Baranoski, Gladimir V. G. and 
                         Proen{\c{c}}a, Hugo Pedro and Junior, Antonio Lopes Apolinario 
                         and Papa, Jo{\~a}o Paulo and Pagliosa, Paulo and dos Santos, 
                         Thiago Oliveira and e S{\'a}, Asla Medeiros and da Silveira, 
                         Thiago Lopes Trugillo and Brazil, Emilio Vital and Ponti, Moacir 
                         A. and Fernandes, Leandro A. F. and Avila, Sandra",
         organization = "Conference on Graphics, Patterns and Images, 34. (SIBGRAPI)",
            publisher = "Sociedade Brasileira de Computa{\c{c}}{\~a}o",
              address = "Porto Alegre",
             keywords = "action recognition, visual rhythm, multi-stream architecture.",
             abstract = "In this work, we address the problem of human action recognition 
                          in videos. We propose and analyze a multi-stream architecture 
                          containing image-based networks pre-trained on the large ImageNet 
                          dataset. Different image representations are extracted from the 
                          videos to feed the streams, providing complementary information 
                          to the system. Here, we propose new streams based on visual 
                          rhythm, which encodes longer-term information compared to still 
                          frames and optical flow. Our main contribution is a stream based 
                          on a new variant of the visual rhythm called Learnable Visual 
                          Rhythm (LVR), formed by the outputs of a deep network. The 
                          features are collected at multiple depths to enable the analysis 
                          of different abstraction levels. This strategy significantly 
                          outperforms the handcrafted version on the UCF101 and HMDB51 
                          datasets. We also investigate many combinations of the streams 
                          to identify the modalities that best complement each other. 
                          Experiments conducted on the two datasets show that our 
                          multi-stream network achieves competitive results compared to 
                          state-of-the-art approaches.",
  conference-location = "Gramado, RS, Brazil (virtual)",
      conference-year = "18-22 Oct. 2021",
             language = "en",
                  ibi = "8JMKD3MGPEW34M/45CU66B",
                  url = "http://urlib.net/ibi/8JMKD3MGPEW34M/45CU66B",
           targetfile = "camera_ready.pdf",
        urlaccessdate = "2024, Apr. 28"
}